Data input

# nucs <- read.csv("EH23a.softmasked_nuccomp.csv")
# nucs$GC <- rowSums( nucs[, c("C", "c", "G", "g")] )/rowSums( nucs[, c("A", "a", "C", "c", "G", "g", "T", "t")] )
# nucs$GC <- nucs$GC * 100
gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#
gff[1:3, 1:8]
##           V1         V2     V3    V4       V5 V6 V7 V8
## 1 EH23a.chr1 annotation remark     1 65878799  .  .  .
## 2 EH23a.chr1    feature   gene 11068    24494  .  +  .
## 3 EH23a.chr1    feature   mRNA 11068    24494  .  +  .
genes <- gff[ gff[, 3] == "gene", ]
# genes[1:3, 1:8]
gff <- read.table("../Figure1/EH23a.unmasked.fasta.mod.EDTA.intact.gff3", sep = "\t", quote = "\"")
names(gff) <- c("seqid","source","type","start","end","score","strand","phase","attributes")

#gff$chrn <- sub("^.+chr", "", gff$V1)
gff$chrn <- sub("^.+chr", "", gff$seqid)
gff$chrn[ gff$chrn == "X" ] <- 10
gff$chrn <- as.numeric(gff$chrn)

gff$classification <- unlist(lapply(strsplit(gff[,9], split = ";"), function(x){ grep("Classification=", x, value = TRUE) }))
gff$classification <- sub("^Classification=", "", gff$classification)
gff$classification <- sub("^LTR/Copia", "Ty1", gff$classification)
gff$classification <- sub("^LTR/Gypsy", "Ty3", gff$classification)
#
gff[1:8, c(1:8, 10)]
##        seqid source                      type start   end score strand phase
## 1 EH23a.chr1   EDTA             repeat_region 48463 58586     .      -     .
## 2 EH23a.chr1   EDTA   target_site_duplication 48463 48467     .      -     .
## 3 EH23a.chr1   EDTA      long_terminal_repeat 48468 50359     .      -     .
## 4 EH23a.chr1   EDTA Copia_LTR_retrotransposon 48468 58581     .      -     .
## 5 EH23a.chr1   EDTA      long_terminal_repeat 56690 58581     .      -     .
## 6 EH23a.chr1   EDTA   target_site_duplication 58582 58586     .      -     .
## 7 EH23a.chr1   EDTA             repeat_region 67125 71966     .      +     .
## 8 EH23a.chr1   EDTA   target_site_duplication 67125 67129     .      +     .
##   chrn
## 1    1
## 2    1
## 3    1
## 4    1
## 5    1
## 6    1
## 7    1
## 8    1
# Ty3
Ty3 <- gff[ gff$classification == "Ty3", ]
Ty3 <- Ty3[ Ty3$type == "Gypsy_LTR_retrotransposon", ]
#Ty3 <- Ty3[ Ty3$V3 == "Gypsy_LTR_retrotransposon", ]
Ty3[1:8, c(1:8, 10:11)]
##          seqid source                      type   start     end score strand
## 81  EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 2110985 2122540     .      ?
## 87  EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 2180122 2191426     .      +
## 161 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 3614107 3626646     .      -
## 206 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 3905221 3916714     .      ?
## 228 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 4168059 4180534     .      -
## 242 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 4447228 4458755     .      -
## 283 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 4885633 4891899     .      -
## 305 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 5232888 5239051     .      +
##     phase chrn classification
## 81      .    1            Ty3
## 87      .    1            Ty3
## 161     .    1            Ty3
## 206     .    1            Ty3
## 228     .    1            Ty3
## 242     .    1            Ty3
## 283     .    1            Ty3
## 305     .    1            Ty3
#table(gff$classification, gff$V1)
#tetbl <- table(gff$V1, gff$classification)
tetbl <- table(gff$seqid, gff$classification)
tetbl <- rbind(tetbl, colMeans(tetbl))
#tetbl

Ty3$motif <- sub("^motif=", "", unlist(lapply(strsplit(Ty3[, 9], split = ";"), function(x){ grep("motif", x, value = TRUE) })))

my_tbl <- table(Ty3$motif, Ty3$chrn)
#my_tbl[, 1:10]
my_mat <- as.matrix(my_tbl)
#my_mat[, 1:10]

my_mat2 <- my_mat[apply(my_mat, MARGIN = 1, function(x){ sum( x != 0 ) }) == 1, ]

my_mat2 <- my_mat2[sort.int(as.numeric(apply(my_mat2, MARGIN = 1, function(x){ paste(x, collapse = "") })), 
         index.return = T)$ix, ]
my_mat2
##       
##        1 2 3 4 5 6 7 8 9 10
##   TTTA 0 0 0 0 0 0 0 0 1  0
##   TGAG 0 0 0 0 0 0 0 1 0  0
##   TATC 0 0 0 0 0 0 1 0 0  0
##   TATT 0 0 0 0 0 0 1 0 0  0
##   TTCC 0 0 0 0 0 0 1 0 0  0
##   TGTG 0 0 0 0 0 1 0 0 0  0
##   TCAC 0 0 0 0 1 0 0 0 0  0
##   TAGC 0 0 0 0 2 0 0 0 0  0
##   TACC 0 0 0 1 0 0 0 0 0  0
##   TTAG 0 0 0 1 0 0 0 0 0  0
##   TTGC 0 0 0 1 0 0 0 0 0  0
##   TAAT 0 0 1 0 0 0 0 0 0  0
##   TCCA 0 0 1 0 0 0 0 0 0  0
##   TGCT 0 0 1 0 0 0 0 0 0  0
##   TAAC 0 1 0 0 0 0 0 0 0  0
##   TCGT 0 1 0 0 0 0 0 0 0  0
##   TCTG 0 1 0 0 0 0 0 0 0  0
##   TACT 1 0 0 0 0 0 0 0 0  0
##   TGTC 1 0 0 0 0 0 0 0 0  0
##   TTCA 1 0 0 0 0 0 0 0 0  0
# my_cmd <- c("~/gits/hempy/bin/fast_win.py --win_size 1000000 /media/knausb/E737-9B48/releases/EH23a/EH23a.softmasked.fasta")
# my_cmd
# system(my_cmd)
# x <- read.table("EH23a.softmasked_wins.csv", header = T, sep = ",")
# #x[1:80, ]
#wins <- read.csv("EH23a.softmasked_wins1e5.csv")
wins <- read.csv("EH23a.softmasked_wins1e6.csv")
#wins <- read.csv("EH23a.softmasked_wins1e7.csv")
# wins <- read.csv("EH23a.softmasked_wins.csv")
wins$CGs <- 0
for( i in unique(wins$Id) ){
  wins$CGs[ wins$Id == i] <- wins$CG[ wins$Id == i] - min(wins$CG[ wins$Id == i], na.rm = TRUE)
  wins$CGs[ wins$Id == i] <- wins$CGs[ wins$Id == i]/max(wins$CGs[ wins$Id == i], na.rm = TRUE)
}
wins$chrn <- sub(".+chr", "", wins$Id)
wins$chrn[ wins$chrn == "X" ] <- 10
wins$chrn <- as.numeric(wins$chrn)
wins$ATs <- 1 - wins$CGs
#wins[1:3, ]

# 
cwins <- wins[wins$CGs == 1 & !is.na(wins$CGs) , ]
# cwins <- cwins[ !duplicated( cwins$Id ), ]
# cwins$cent <- cwins$Start + cwins$Win_length/2

Centromere genes

gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#gff[gff$V3 == "remark", ]
#genes[ genes$V1 == cwins$Id[i] & genes$V4 >= cwins$Start[i] & genes$V5 <= cwins$End[i], ]
i <- 1
gff2 <- gff[ gff$V1 == cwins$Id[i] & gff$V4 >= cwins$Start[i] & gff$V5 <= cwins$End[i], ]
for( i in 2:nrow(cwins) ){
#  gff2 <- gff[ gff$V1 == cwins$Id[i] & gff$V4 >= cwins$Start[i] & gff$V5 <= cwins$End[i], ]
  gff2 <- rbind(gff2, gff[ gff$V1 == cwins$Id[i] & gff$V4 >= cwins$Start[i] & gff$V5 <= cwins$End[i], ])
}

#cbind(1:nrow(gff2), gff2[, 1:8])
#table(gff2$V3)

#gff2[gff2$V3 == "CDS", ]
#gff2[gff2$V3 == "gene", ]
gff2 <- gff2[gff2$V3 == "mRNA", ]

#unlist(lapply(strsplit(gff2$V9, split = ";"), function(x){ grep("desc", x, value = TRUE) }))
# lndmrk <- read.csv("csat_landmark_EH23a_blastn.csv", header = FALSE)
# names(lndmrk) <- c("qseqid","qlen","sseqid","slen","qstart","qend","sstart","send","evalue","bitscore","score","length","pident","nident","mismatch","positive","gapopen","gaps","ppos","sstrand")
# lndmrk$chrn <- sub("^.+chr", "", lndmrk$sseqid)
# lndmrk$chrn[ lndmrk$chrn == "X" ] <- 10
# lndmrk$chrn[ lndmrk$chrn == "Y" ] <- 11
# lndmrk$chrn <- as.numeric(lndmrk$chrn)
# 
# lndmrk[1:3, ]

Variant data

vars <- read.table("ERBxHO40vars.csv.gz", header = TRUE, sep = ",")
nrow(vars)
## [1] 450759
vars <- vars[vars$Missing == 0, ]
nrow(vars)
## [1] 35186
vars[1:3, ]
##      CHROM    POS   n Allele_counts        He       Ne REFhom HET ALThom
## 48  chr_1e  62350 270        509,31 0.1082236 1.121357    240  29      1
## 102 chr_1e  98473 270       412,128 0.3617010 1.566664    145 122      3
## 220 chr_1e 204226 270       263,277 0.4996639 1.998657     65 133     72
##     Missing
## 48        0
## 102       0
## 220       0
vars$ERBallele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[1]})))
vars$HO40allele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[2]})))

vars$ERBallele <- vars$ERBallele / (vars$n * 2)
vars$HO40allele <- vars$HO40allele / (vars$n * 2)


my_chrom <- "chr_1e"
plot( vars$POS[ vars$CHROM == my_chrom ], vars$ERBallele[ vars$CHROM == my_chrom ],
      pch = 20, col = "#C0C0C066", ylim = c(0, 1))

points( vars$POS[ vars$CHROM == my_chrom ], vars$HO40allele[ vars$CHROM == my_chrom ],
      pch = 20, col = "#4682B4")

vars$chrn <- sub("chr_", "", vars$CHROM)
vars$chrn <- sub("e$", "", vars$chrn)
vars$chrn[ vars$chrn == "X" ] <- 10
vars$chrn <- as.numeric(vars$chrn)
table(vars$chrn)
## 
##    1    2    3    4    5    6    7    8    9   10 
## 2933 3228 4869 5716 3184 2886 2317 1975 4690 3388
library(ggplot2)
ggplot(vars, aes(x=chrn, y=POS, group=CHROM)) +
  geom_point()

my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Set3"), "08", sep="")
#my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Paired"), "08", sep="")

hist(vars$ERBallele[vars$chrn == 2], xlim = c(0, 1))

hist(vars$HO40allele[vars$chrn == 2], xlim = c(0, 1))

p <- ggplot(vars, aes(x=chrn+ERBallele-0.5, y=POS, group=CHROM)) +
  geom_point( aes( color = CHROM ) )
p <- p + scale_color_manual(values=my_cols1)
#p <- p + geom_point( data = vars, aes(x = chrn-HO40allele, y=POS), color = vars$chrn)
#
p <- p + geom_point( data = vars, aes(x = chrn+HO40allele-0.5, y=POS), color = my_cols1[vars$chrn])
#p <- p + geom_point( data = vars, aes(x = chrn-0, y=POS), color = my_cols1[vars$chrn])
#p <- p + scale_color_manual(values=my_cols1)
p <- p + theme_bw()
p <- p + theme(legend.position='none')
p <- p + scale_x_continuous(breaks=seq(1, 10, 1))
p <- p + xlab("Chromosome")
p

#p + scale_color_brewer(palette="Dark2")
#p + scale_color_brewer(palette="Set3")
#  geom_point( aes( color = CHROM, alpha = 1/10 ) )
#  geom_point( aes( alpha = 1/10 ) )

Windowize

# wins[1:3, ]
# genes[1:3, 1:8]

wins$gcnt <- 0
wins$ty1cnt <- 0
wins$ty3cnt <- 0
for(i in 1:nrow(wins)){
   tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
   wins$gcnt[i] <- nrow(tmp)

   tmp <- gff[gff$V1 == wins$Id[i] & gff$V4 >= wins$Start[i] & gff$V5 < wins$End[i], ]

   wins$ty1cnt[i] <- sum(tmp$classification == "Ty1")
   wins$ty3cnt[i] <- sum(tmp$classification == "Ty3")
}

wins$gcntsc <- wins$gcnt - min(wins$gcnt)
wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
#range(round( wins$gcntsc * 100 ))
my_index <- round( wins$gcntsc * 100 )
#my_index <- 100 - my_index
my_index[ my_index <= 0] <- 1
# 
# #wins$gcol <- heat.colors(n=100)[ my_index ]
# #wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
# #wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
# #wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
# #wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]
# 
# 
# #wins$gcol <- viridisLite::plasma(n = 100, alpha = 1, begin = 0.1, end = 1)[ my_index ]
# wins$gcol <- viridisLite::magma(n = 100, alpha = 1, begin = 0.2, end = 1.00)[ my_index ]
# 
# wins$chr <- sub("^.+chr", "chr", wins$Id)
# #
# wins[1:3, ]
# Cannabinoid BLAST results.

# cann_blst <- read.csv("cann_3dom_EH23a_tblastn.csv", header = FALSE)
# colnames(cann_blst) <- c('qseqid','qlen','sseqid','slen','qstart',
#                          'qend','sstart','send','evalue','bitscore',
#                          'score','length','pident','nident','mismatch',
#                          'positive','gapopen','gaps','ppos','sstrand',
#                          'sseq')
# #
# cann_blst <- cann_blst[grep("SignalP|FAD|BBE", cann_blst$qseqid, invert = TRUE, value = F), ]
# cann_blst$name <- sub("^.+_", "", cann_blst$qseqid)
# 
# # table(cann_blst$qlen)
# cann_blst <- cann_blst[ cann_blst$gaps <= 0, ]
# cann_blst <- cann_blst[ cann_blst$mismatch <= 10, ]
# 
# # write.table(cann_blst, file = "cann_3dom_EH23a_tblastn_filter.csv",
# #             sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
# # system("~/gits/hempy/bin/blast_to_gff.py cann_3dom_EH23a_tblastn_filter.csv")
# 
# cann_blst$chrn <- as.numeric(sub(".+chr", "", cann_blst$sseqid))
# 
# #table(cann_blst$qseqid)
# 
# # cann_blst[1:3, 1:10]
# # cann_blst[1:3, 10:20]
# 
# knitr::kable(cann_blst[, c(1, 2, 3, 7, 13:15, 18, 23)], caption = "**Table 1. Cannabinoid synthase genes.**")
# 
#nrow(cann_blst)
# blst <- read.csv("EH23a_blastn.csv", header = FALSE)
# colnames(blst) <- c('qseqid','qlen','sseqid','slen','qstart',
#                     'qend','sstart','send','evalue','bitscore',
#                     'score','length','pident','nident','mismatch',
#                     'positive','gapopen','gaps','ppos','sstrand',
#                     #'staxids','sblastnames','salltitles',
#                     'sseq')#[1:21]
# 
# #blst$sseqid <- factor(blst$sseqid, levels = c("EH23a.chr1", "EH23a.chr2", "EH23a.chr3", "EH23a.chr4", "EH23a.chr5", "EH23a.chr6", "EH23a.chr7", "EH23a.chr8", "EH23a.chr9", "EH23a.chrX"))
# #blst <- blst[blst$qseqid == "CsatSD_centromere_370bp", ]
# #table(blst$qseqid)
# # blst[1:3, 1:10]
# 
# blst$chrn <- sub(".+chr", "", blst$sseqid)
# blst$chrn[ blst$chrn == "X" ] <- 10
# blst$chrn <- as.numeric(blst$chrn)
# 
# subt <- blst[grep("CsatSD_centromere_370bp", blst$qseqid), ]
# cent <- blst[grep("CsatSD_centromere_237bp", blst$qseqid), ]
# 
# #blst
# #knitr::kable(blst[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
# knitr::kable(subt[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")

Ideo function

# plot_ideo <- function() {
#   suppressPackageStartupMessages(require(ggplot2))
# 
#   # marker_df <- data.frame(
#   #   chrom = rep(names(map), times = lapply(map, length)),
#   #   pos = unlist(map),
#   #   marker = names(unlist(map))
#   # )
#   # marker_df$chromf <- factor( marker_df$chrom, levels = names(map) )
#   # marker_df$chromn <- as.numeric( marker_df$chromf )
# 
#   # chr_df <- data.frame(
#   #   start = unlist(lapply(map, min)),
#   #   end = unlist(lapply(map, max))
#   # )
#   # chr_df$chr <- names(map)
#   # chr_df$chrf <- factor( chr_df$chr, levels = names(map))
#   # chr_df$chrn <- as.numeric( chr_df$chrf )
# 
#   chr_df <- data.frame(
#     start = 1,
#     end = nucs$Length,
#     chr = nucs$Id
#   )
#   chr_df$chrn <- sub(".+chr", "", nucs$Id)
#   chr_df$chrn[chr_df$chrn == "X"] <- 10
#   chr_df$chrn <- as.numeric(chr_df$chrn)
#   
#   
#   chrom_wid <- 0.02
#   p <- ggplot2::ggplot()
#   p <- p + ggplot2::geom_rect( data = chr_df, 
#                                ggplot2::aes( xmin = chrn - chrom_wid,
#                                   xmax = chrn + chrom_wid,
#                                   #xmin = as.numeric(as.factor(chr)) - chrom_wid,
#                                   #xmax = as.numeric(as.factor(chr)) + chrom_wid,
#                                   ymin = end, ymax = start), 
#                       #fill = "#C0C0C0",
#                       fill = "#DCDCDC",
#                       #fill = "#F5F5F5",
#                       color = "#000000"
#                       )
#   #p <- p + scale_y_reverse(limits = c(max_gd, 0))
#   
#   #wins$Id
#   thinw <- 0.28
#   p <- p + ggplot2::geom_rect( 
#     data = wins, 
#     ggplot2::aes( 
#       # xmin = chrn - CGs,
#       # xmax = chrn + CGs,
#       xmin = chrn - ATs * thinw,
#       xmax = chrn + ATs * thinw,
#       ymin = Start, 
#       ymax = End),
#     fill = wins$gcol,
#     #fill = "#0000CD",
#     #fill = "#A9A9A9",
#     #fill = "#C0C0C0",
#     #fill = "#DCDCDC",
#     #fill = "#F5F5F5",
# #                  color = "#000000"
#     color = NA
#     )
#   #p
# 
#   cmwidth <- 0.4
#   p <- p + ggplot2::geom_rect( 
#     data = cann_blst, 
#     ggplot2::aes(
#       xmin = chrn - cmwidth,
#       xmax = chrn + cmwidth,
#       ymin = sstart, 
#       ymax = send),
#     #fill = "#0000CD",
#     #fill = "#A9A9A9",
#     fill = "#C0C0C0",
#     #fill = "#DCDCDC",
#     #fill = "#F5F5F5",
#     #color = "#000000"
#     color = "#228B22"
#     #color = NA
#     )
#   #p
#   
# 
#   # p <- p + annotate(geom="text", x=7.5, y=11.3e6, label="THCAS1",
#   #             color="#228B22", size = 3)
#   # p <- p + annotate(geom="text", x=7.5, y=12.1e6, label="CBDAS2",
#   #             color="#0000FF", size = 3)
#   # p <- p + annotate(geom="text", x=7.5, y=30.7e6, label="CBCAS",
#   #             color="#B22222", bg = "#ffffff", size = 3)
#   #p
#   
#   stwidth <- 0.40
#   p <- p + ggplot2::geom_rect( 
#     data = subt, 
#     ggplot2::aes(
#       xmin = chrn - stwidth,
#       xmax = chrn + stwidth,
#       #xmax = chrn,
#       ymin = sstart, 
#       ymax = send),
#     #fill = "#0000CD",
#     #fill = "#A9A9A9",
#     fill = "#C0C0C0",
#     #fill = "#DCDCDC",
#     #fill = "#F5F5F5",
#     #color = "#000000"
#     color = "#0000FF"
#     #color = "#228B22"
#     #color = NA
#     )
#   #p
#   
#   mwidth <- 0.6
#   p <- p + ggplot2::geom_rect( 
#     data = cent, 
#     ggplot2::aes(
#       xmin = chrn - mwidth,
# #      xmax = chrn + mwidth,
#       xmax = chrn - 0.2,
#       ymin = sstart, 
#       ymax = send),
#     #fill = "#0000CD",
#     #fill = "#A9A9A9",
#     fill = "#C0C0C0",
#     #fill = "#DCDCDC",
#     #fill = "#F5F5F5",
#     color = "#000000"
#     #color = "#0000FF"
#     #color = "#8A2BE2"
#     #color = "#228B22"
#     #color = NA
#     )
#   #p
#   
# 
# #   mwidth <- 0.4
# #   p <- p + ggplot2::geom_rect( 
# #     data = Ty3, 
# #     ggplot2::aes(
# #       xmin = chrn - 0,
# # #      xmin = chrn - mwidth,
# #       xmax = chrn + mwidth,
# # #      xmax = chrn,
# #       ymin = V4, 
# #       ymax = V5),
# #     fill = "#0000FF",
# #     color = NA
# #     )
# #   #p
#   
#   
#   # marker_wid <- 0.1
#   # marker_high <- 0.4
#   # p <- p + ggplot2::geom_rect( data = marker_df, 
#   #                     ggplot2::aes( xmin = chromn - marker_wid,
#   #                          xmax = chromn + marker_wid, 
#   #                          #xmin = as.numeric(as.factor(chrom)) - marker_wid,
#   #                          #xmax = as.numeric(as.factor(chrom)) + marker_wid,
#   #                          ymin = pos - marker_high, ymax = pos + marker_high),
#   #                     fill = "#228B22", color = "#228B22"
#   # )
#   # 
#   # p <- p + scale_y_reverse( breaks = seq(0, 2e3, by = 100) )
#   #p <- p + ggplot2::scale_y_reverse( minor_breaks = seq(0, 2e3, by = 20), breaks = seq(0, 2e3, by = 100) )
#   #p <- p + scale_x_continuous( breaks = as.numeric(as.factor(chr_df$chr) ) )
#   p <- p + ggplot2::scale_x_continuous( breaks = chr_df$chrn, labels = chr_df$chr )
#   # p <- p + scale_y_reverse(limits = c(max_gd, 0))
#   # p <- p + scale_y_reverse(limits = c(0, max_gd))
#   #p <- p + theme_bw() + theme( panel.grid.minor = element_blank() )
#   p <- p + ggplot2::theme_bw() + 
#     ggplot2::theme( panel.grid.minor.x = ggplot2::element_blank(), 
#            axis.text.x = element_text(angle = 60, hjust = 1),
#            axis.title.x=element_blank(),
#            panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
#            panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
#            #panel.grid.major.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 1 ),
#            #panel.grid.minor.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 3 )
#           )
# #  p <- p + ggplot2::xlab("Chromosome")
# 
#   #p <- p + ylab("Distance (cM)")
#   #p <- p + ggplot2::ylab("Location (cM)")
#   p <- p + ggplot2::ylab("Position (bp)")
#   #p
#   
# 
#   #p
# 
#   #return( invisible( NULL ) )
#   p
# }
# p1 <- plot_ideo()
# p1

Add

#lndmrk[1:3, ]
#nrow(lndmrk)
#table(lndmrk$qseqid)

# pmildew <- "PNW39_LH3804_locus"
# lgrp <- system( 'grep "^>" ../csat_landmark_dna.fa', intern = TRUE )
# lgrp <- sub("^>", "", lgrp)
# #lgrp
# 
# # Cann
# my_regex <- sub("[[:space:]].+", "", lgrp[1:6])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# Cent, subTelo
#lgrp[7:9]
# my_regex <- sub("[[:space:]].+", "", lgrp[7:9])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# table(lndmrk2$qseqid)
# 
# chrom_wid <- 0.46
# tmp <- data.frame( 
#   xmin = lndmrk2$chrn - chrom_wid, 
#   xmax = lndmrk2$chrn + chrom_wid, 
#   ymin = lndmrk2$sstart, 
#   ymax = lndmrk2$send
#   )
# 
# p1 + ggplot2::geom_rect( 
#   data = tmp, 
#       ggplot2::aes( 
#         xmin = xmin,
#         xmax = xmax,
#         ymin = ymin, 
#         ymax = ymax
#         ),
#       fill = "#DCDCDC",
# #      color = "#CD853F",
#       color = "#F08080",
# #      color = "#000000"
#       linewidth = 2
#       ) + ggtitle("Sub-telo, centromeric motifs")
# P mildew
#lgrp[10:10]
# my_regex <- sub("[[:space:]].+", "", lgrp[10:10])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# 
# chrom_wid <- 0.46
# tmp <- data.frame( 
#   xmin = lndmrk2$chrn - chrom_wid, 
#   xmax = lndmrk2$chrn + chrom_wid, 
#   ymin = lndmrk2$sstart, 
#   ymax = lndmrk2$send
#   )
# 
# p1 + ggplot2::geom_rect( 
#   data = tmp, 
#       ggplot2::aes( 
#         xmin = xmin,
#         xmax = xmax,
#         ymin = ymin, 
#         ymax = ymax
#         ),
#       fill = "#DCDCDC",
# #      color = "#CD853F",
#       color = "#F08080",
# #      color = "#000000"
#       linewidth = 2
#       ) + ggtitle("Powdery mildew")
# # CENP
# grep("NC_044371.1:c59909808-59908167 Cannabis sativa chromosome 1, cs10, whole genome shotgun sequence", lgrp):grep("NC_044371.1:c74285628-74284242 Cannabis sativa chromosome 1, cs10, whole genome shotgun sequence", lgrp)
# #lgrp[11:26]
# my_regex <- sub("[[:space:]].+", "", lgrp[11:26])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# 
# chrom_wid <- 0.46
# tmp <- data.frame( 
#   xmin = lndmrk2$chrn - chrom_wid, 
#   xmax = lndmrk2$chrn + chrom_wid, 
#   ymin = lndmrk2$sstart, 
#   ymax = lndmrk2$send
#   )
# 
# p1 + ggplot2::geom_rect( 
#   data = tmp, 
#       ggplot2::aes( 
#         xmin = xmin,
#         xmax = xmax,
#         ymin = ymin, 
#         ymax = ymax
#         ),
#       fill = "#DCDCDC",
# #      color = "#CD853F",
#       color = "#F08080",
# #      color = "#000000"
#       linewidth = 2
#       ) + ggtitle("CENP")
# # rRNA
# grep("NC_044375.1:87745002-87745120 Cannabis sativa chromosome 2, cs10, whole genome shotgun sequence", lgrp):grep("NC_026562.1:100368-101858 Cannabis sativa cultivar Carmagnola chloroplast, complete genome", lgrp)
# #lgrp[27:55]
# my_regex <- sub("[[:space:]].+", "", lgrp[27:55])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# 
# lndmrk2 <- lndmrk2[ lndmrk2$gaps == 0, ]
# lndmrk2 <- lndmrk2[ lndmrk2$pident >= 95, ]
# 
# #lndmrk2 <- lndmrk2[grep("^NC_044", lndmrk2$qseqid), ]
# 
# tbl2 <- table(lndmrk2$chrn, lndmrk2$qseqid)
# tbl2[, colSums(as.matrix(tbl2)) > 1e3]
# #tbl2['8',]
# 
# #tbl2[, grep("^NW", colnames(tbl2))]
# 
# table(lndmrk2$chrn)
# 
# # OAS
# #lgrp[56:57]
# #my_regex <- sub("[[:space:]].+", "", lgrp[56:57])
# #my_regex <- paste(my_regex, collapse="|")
# #lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# 
# 
# #tmp <- data.frame( xmin = 1:4, xmax = 2:5, ymin = rep(10, times = 4), ymax = rep(10000000, times = 4))
# 
# chrom_wid <- 0.46
# tmp <- data.frame( 
#   xmin = lndmrk2$chrn - chrom_wid, 
#   xmax = lndmrk2$chrn + chrom_wid, 
#   ymin = lndmrk2$sstart, 
#   ymax = lndmrk2$send
#   )
# 
# p1 + ggplot2::geom_rect( 
#   data = tmp, 
#       ggplot2::aes( 
#         xmin = xmin,
#         xmax = xmax,
#         ymin = ymin, 
#         ymax = ymax
#         ),
#       #fill = "#C0C0C0",
#       fill = "#DCDCDC",
#       #fill = "#F5F5F5",
# #      color = "#FFD700"
# #      color = "#DAA520",
#       color = "#CD853F",
# #      color = "#000000"
#       linewidth = 2
#       ) + ggtitle("rRNA")
# 
# #ggsave(filename = "EH23a_ideo.png", device = "png", width = 6.5, height = 6.5, units = "in", dpi = 300)
# 
# # OAS
# #lgrp[56:57]
# my_regex <- sub("[[:space:]].+", "", lgrp[56:57])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# 
# chrom_wid <- 0.46
# tmp <- data.frame( 
#   xmin = lndmrk2$chrn - chrom_wid, 
#   xmax = lndmrk2$chrn + chrom_wid, 
#   ymin = lndmrk2$sstart, 
#   ymax = lndmrk2$send
#   )
# 
# p1 + ggplot2::geom_rect( 
#   data = tmp, 
#       ggplot2::aes( 
#         xmin = xmin,
#         xmax = xmax,
#         ymin = ymin, 
#         ymax = ymax
#         ),
#       fill = "#DCDCDC",
# #      color = "#CD853F",
#       color = "#F08080",
# #      color = "#000000"
#       linewidth = 2
#       ) + ggtitle("OAS")

Chrom by genes

my_data <- readr::read_csv("gene_counts.csv", )
## Rows: 69 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Sample
## dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX, chrY
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# names(my_data)[1] <- "Sample"
# my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
##    Sample  chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8  chr9  chrX  chrY
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AH3Ma   4052  2763  2586  3189  2424  2535  2058  2892  2610  3689    NA
##  2 AH3Mb   4062  2807  2620  3224  2563  2604  2080  2967  2509    NA  2850
##  3 BCMa    4137  2878  2682  3256  2597  2599  2031  2953  2654  3792    NA
##  4 BCMb    4059  2703  2611  3098  2601  2615  1986  2905  2569    NA  2751
##  5 BOAXa   4097  2877  2680  3147  2444  2718  2015  2950  2612  3778    NA
##  6 BOAXb   4039  2816  2507  3159  2499  2586  1995  2914  2551  3670    NA
##  7 COFBa   4365  2681  2552  3035  2404  2457  1793  2840  2483  3625    NA
##  8 COFBb   4078  2960  2755  3300  2768  2750  2212  3058  2769  3944    NA
##  9 COSVa   4063  2834  2677  3130  2595  2580  2048  2940  2594  3722    NA
## 10 COSVb   4049  2846  2690  3175  2601  2645  2029  2977  2566  3801    NA
## # ℹ 59 more rows
library(tidyr)
data_long <- my_data %>%
  pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Count")
#data_long$Length <- data_long$Length / 1e6
gcount <- data_long
sum(is.na(data_long$Count))
## [1] 69
data_long <- data_long[!is.na(data_long$Count), ]
data_long
## # A tibble: 690 × 3
##    Sample Chrom Count
##    <chr>  <chr> <dbl>
##  1 AH3Ma  chr1   4052
##  2 AH3Ma  chr2   2763
##  3 AH3Ma  chr3   2586
##  4 AH3Ma  chr4   3189
##  5 AH3Ma  chr5   2424
##  6 AH3Ma  chr6   2535
##  7 AH3Ma  chr7   2058
##  8 AH3Ma  chr8   2892
##  9 AH3Ma  chr9   2610
## 10 AH3Ma  chrX   3689
## # ℹ 680 more rows
my_data <- readr::read_csv("chrom_lengths.csv")
## New names:
## Rows: 69 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX,
## chrY
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
names(my_data)[1] <- "Sample"
my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
##    Sample   chr1   chr2   chr3   chr4   chr5   chr6   chr7   chr8   chr9    chrX
##    <fct>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
##  1 AH3Ma  6.57e7 7.65e7 8.20e7 8.16e7 7.65e7 7.63e7 6.64e7 5.41e7 6.60e7  8.42e7
##  2 AH3Mb  6.63e7 7.62e7 8.11e7 8.09e7 7.80e7 7.64e7 6.65e7 5.52e7 6.50e7 NA     
##  3 BCMa   6.51e7 7.58e7 8.62e7 8.12e7 7.67e7 7.53e7 6.55e7 5.48e7 6.49e7  8.33e7
##  4 BCMb   6.32e7 7.83e7 8.16e7 7.84e7 7.69e7 7.68e7 6.44e7 5.42e7 6.55e7 NA     
##  5 BOAXa  6.53e7 7.96e7 8.19e7 7.80e7 7.64e7 7.91e7 6.40e7 5.48e7 6.53e7  8.32e7
##  6 BOAXb  6.50e7 7.76e7 7.95e7 7.97e7 7.57e7 7.58e7 6.49e7 5.44e7 6.60e7  8.26e7
##  7 COFBa  7.21e7 7.85e7 8.44e7 8.21e7 7.76e7 7.97e7 6.37e7 5.57e7 6.55e7  8.57e7
##  8 COFBb  6.47e7 7.71e7 8.21e7 8.16e7 7.77e7 7.71e7 6.84e7 5.52e7 6.63e7  8.46e7
##  9 COSVa  6.76e7 8.34e7 8.63e7 8.52e7 8.09e7 7.93e7 6.99e7 5.53e7 6.81e7  8.65e7
## 10 COSVb  6.61e7 8.15e7 8.74e7 8.40e7 8.46e7 8.30e7 6.50e7 5.75e7 6.78e7  8.58e7
## # ℹ 59 more rows
## # ℹ 1 more variable: chrY <dbl>
library(tidyr)
data_long <- my_data %>%
  pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Length")
data_long$Length <- data_long$Length / 1e6
data_long
## # A tibble: 759 × 3
##    Sample Chrom Length
##    <fct>  <chr>  <dbl>
##  1 AH3Ma  chr1    65.7
##  2 AH3Ma  chr2    76.5
##  3 AH3Ma  chr3    82.0
##  4 AH3Ma  chr4    81.6
##  5 AH3Ma  chr5    76.5
##  6 AH3Ma  chr6    76.3
##  7 AH3Ma  chr7    66.4
##  8 AH3Ma  chr8    54.1
##  9 AH3Ma  chr9    66.0
## 10 AH3Ma  chrX    84.2
## # ℹ 749 more rows
glength <- data_long
glength
## # A tibble: 759 × 3
##    Sample Chrom Length
##    <fct>  <chr>  <dbl>
##  1 AH3Ma  chr1    65.7
##  2 AH3Ma  chr2    76.5
##  3 AH3Ma  chr3    82.0
##  4 AH3Ma  chr4    81.6
##  5 AH3Ma  chr5    76.5
##  6 AH3Ma  chr6    76.3
##  7 AH3Ma  chr7    66.4
##  8 AH3Ma  chr8    54.1
##  9 AH3Ma  chr9    66.0
## 10 AH3Ma  chrX    84.2
## # ℹ 749 more rows
gcount
## # A tibble: 759 × 3
##    Sample Chrom Count
##    <chr>  <chr> <dbl>
##  1 AH3Ma  chr1   4052
##  2 AH3Ma  chr2   2763
##  3 AH3Ma  chr3   2586
##  4 AH3Ma  chr4   3189
##  5 AH3Ma  chr5   2424
##  6 AH3Ma  chr6   2535
##  7 AH3Ma  chr7   2058
##  8 AH3Ma  chr8   2892
##  9 AH3Ma  chr9   2610
## 10 AH3Ma  chrX   3689
## # ℹ 749 more rows
data_long <- cbind(glength, gcount$Count)
names(data_long)[4] <- "Count"
data_long[1:3, ]
##   Sample Chrom   Length Count
## 1  AH3Ma  chr1 65.67137  4052
## 2  AH3Ma  chr2 76.48150  2763
## 3  AH3Ma  chr3 81.99513  2586
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")

library(ggplot2)
# Basic scatter plot
p <- ggplot(data_long, aes(x=Length, y=Count, color=Chrom)) 
p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)

p <- p + theme_bw()
p <- p + theme(legend.title = element_blank()) 
#p <- p + theme(legend.position = "left")
p <- p + theme(legend.position = "right")
#p <- p + theme( legend.spacing.y = unit(1.0, 'mm') )
## important additional element
#p <- p + guides(fill = guide_legend(byrow = TRUE))
p + theme(legend.spacing.y = unit(1.0, 'mm')) +
  guides(fill = guide_legend(byrow = TRUE))
## `geom_smooth()` using formula = 'y ~ x'

#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Chromosome length (Mbp)")
p <- p + ylab("Genes per chromosome")



p2 <- p
p2
## `geom_smooth()` using formula = 'y ~ x'

TE plot

wins <- read.csv("EH23a.softmasked_wins1e6.csv")

gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#gff[1:3, 1:8]
genes <- gff[ gff[, 3] == "gene", ]
gff <- read.table("../Figure1/EH23a.unmasked.fasta.mod.EDTA.intact.gff3", sep = "\t", quote = "\"")
names(gff) <- c("seqid","source","type","start","end","score","strand","phase","attributes")

#gff$chrn <- sub("^.+chr", "", gff$V1)
gff$chrn <- sub("^.+chr", "", gff$seqid)
gff$chrn[ gff$chrn == "X" ] <- 10
gff$chrn <- as.numeric(gff$chrn)

gff$classification <- unlist(lapply(strsplit(gff[,9], split = ";"), function(x){ grep("Classification=", x, value = TRUE) }))
gff$classification <- sub("^Classification=", "", gff$classification)
gff$classification <- sub("^LTR/Copia", "Ty1", gff$classification)
gff$classification <- sub("^LTR/Gypsy", "Ty3", gff$classification)
#
gff[1:8, c(1:8, 10)]
##        seqid source                      type start   end score strand phase
## 1 EH23a.chr1   EDTA             repeat_region 48463 58586     .      -     .
## 2 EH23a.chr1   EDTA   target_site_duplication 48463 48467     .      -     .
## 3 EH23a.chr1   EDTA      long_terminal_repeat 48468 50359     .      -     .
## 4 EH23a.chr1   EDTA Copia_LTR_retrotransposon 48468 58581     .      -     .
## 5 EH23a.chr1   EDTA      long_terminal_repeat 56690 58581     .      -     .
## 6 EH23a.chr1   EDTA   target_site_duplication 58582 58586     .      -     .
## 7 EH23a.chr1   EDTA             repeat_region 67125 71966     .      +     .
## 8 EH23a.chr1   EDTA   target_site_duplication 67125 67129     .      +     .
##   chrn
## 1    1
## 2    1
## 3    1
## 4    1
## 5    1
## 6    1
## 7    1
## 8    1
# Ty3
Ty3 <- gff[ gff$classification == "Ty3", ]
Ty3 <- Ty3[ Ty3$type == "Gypsy_LTR_retrotransposon", ]
#Ty3 <- Ty3[ Ty3$V3 == "Gypsy_LTR_retrotransposon", ]
Ty3[1:8, c(1:8, 10:11)]
##          seqid source                      type   start     end score strand
## 81  EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 2110985 2122540     .      ?
## 87  EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 2180122 2191426     .      +
## 161 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 3614107 3626646     .      -
## 206 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 3905221 3916714     .      ?
## 228 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 4168059 4180534     .      -
## 242 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 4447228 4458755     .      -
## 283 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 4885633 4891899     .      -
## 305 EH23a.chr1   EDTA Gypsy_LTR_retrotransposon 5232888 5239051     .      +
##     phase chrn classification
## 81      .    1            Ty3
## 87      .    1            Ty3
## 161     .    1            Ty3
## 206     .    1            Ty3
## 228     .    1            Ty3
## 242     .    1            Ty3
## 283     .    1            Ty3
## 305     .    1            Ty3
wins$gcnt <- 0
#wins$ty1cnt <- 0
wins$ty3cnt <- 0
for(i in 1:nrow(wins)){
   tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
   wins$gcnt[i] <- nrow(tmp)
   #tmp <- gff[gff$V1 == wins$Id[i] & gff$V4 >= wins$Start[i] & gff$V5 < wins$End[i], ]
   tmp <- Ty3[Ty3$seqid == wins$Id[i] & Ty3$start >= wins$Start[i] & Ty3$end < wins$End[i], ]
   wins$ty3cnt[i] <- nrow(tmp)
#   wins$ty1cnt[i] <- sum(tmp$classification == "Ty1")
}
wins[1:3, ]
##           Id Win_number   Start     End Win_length      A      C      G      T
## 1 EH23a.chr1          0       0  999999    1000000 339510 159349 157789 343352
## 2 EH23a.chr1          1 1000000 1999999    1000000 346184 152100 152139 349577
## 3 EH23a.chr1          2 2000000 2999999    1000000 343278 155236 154692 346794
##      CG   CHG   CHH gcnt ty3cnt
## 1 14003 19695 90544  156      0
## 2 13018 18441 88693  141      0
## 3 14100 19606 89229  123      2
hist(wins$ty3cnt)

#plot(wins$gcnt, wins$ty3cnt)
#plot(wins$gcnt, wins$ty1cnt)
# wins[1:3, ]
# table(wins$Id)

# my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
# #my_pal[11] <- "#B15928"
# my_pal[11] <- "#C71585"
# #my_pal <- paste(my_pal, "66", sep = "")

my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")

lm1 <- lm( wins$ty3cnt ~ wins$gcnt )
summary(lm1)
## 
## Call:
## lm(formula = wins$ty3cnt ~ wins$gcnt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4540 -1.9798  0.0355  1.7483 10.8191 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.453983   0.182058   51.93   <2e-16 ***
## wins$gcnt   -0.067007   0.003699  -18.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.905 on 742 degrees of freedom
## Multiple R-squared:  0.3066, Adjusted R-squared:  0.3057 
## F-statistic: 328.1 on 1 and 742 DF,  p-value: < 2.2e-16
my_coefs <- round(lm1$coefficients, digits = 3)

#wins[1:3, ]

library(ggplot2)
# Basic scatter plot
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=Id))
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=chr))
p <- ggplot( data = wins, aes(x=gcnt, y=ty3cnt, color = Id))
#p <- p + geom_point(size=2, color = "#C0C0C0")
#p <- p + geom_point(size=2, color = "#77889966")
# p <- p + geom_point(size=2, color = "#70809044")
#
p <- p + geom_point(size=2)

#p <- p + geom_smooth(method=lm)
#p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 2)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)

#p <- p + geom_text(x=100, y=90, label="y = (-0.4)x + 56.7", size = 4)
p <- p + geom_text(x=100, y=15, label= paste("y = ", my_coefs[2], "x + ", my_coefs[1], sep = ""), color = "#000000", size = 4, parse = F)

p <- p + theme_bw()
p <- p + theme(legend.position = "none")
#p <- p + theme(legend.title = element_blank()) 
#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
# p <- p + xlab("Genes per 1 Mbp window")
# p <- p + ylab("Ty3 elements per 1 Mbp window")
p <- p + xlab("Genes per window")
p <- p + ylab("Ty3 elements window")
# p

p3 <- p
p3
## `geom_smooth()` using formula = 'y ~ x'

# lm2 <- lm( wins$ty1cnt ~ wins$gcnt )
# summary(lm2)

Graphic

source("../FigureSideo/Rfunctions.R")
p1 <- plot_ideo("/media/knausb/E737-9B48/releases/scaffolded//EH23a")
#library(magick)
#ggdraw() +
#  cowplot::draw_image("https://i.stack.imgur.com/WDOo4.jpg?s=328&g=1") #+
#  draw_plot(my_plot)

Figure 1 (Composite plot)

library("ggpubr")

ggarrange(
  plotlist = list(p1, 
                  ggarrange(p3, p2, 
                            ncol = 2, nrow = 1, 
                            widths = c(2, 3),
                            labels = c("B", "C"))
                  ),
  labels = c("A", ""),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.

Figure 1. Genomic architecture of the Cannabis sativa genome. (A) Ideogram of the ERBxHO40_23 haplotype A genome. Each chromosome is divided into 1 Mbp windows where each window’s width is proportional to the inverse abundance of the motif ‘CG’ and each window is colored according to gene density. Windows with high ‘CG’ abundance are narrow and windows with high gene counts are yellow. The plant ERBxHO40_23 resulted from a cross between strains ERB (chemotype III or CBD dominant) and HO40 (chemotype I or THC dominant). The genetically female plant HO40 (XX) was chemically masculinized to produce pollen flowers to facilitate the cross. Subtelomeric repeat motifs are marked with blue horizontal lines. Cannabinoid synthase genes are marked with green horizontal lines on chromosome seven. (B) The abundances of genes and Ty3 LTRs are inversely related. Each dot represents a 1 Mbp window from the ERBxHO40_23 haplotype A assembly, organized by the number of genes and Ty3 LTR elements contained in each window. Windows with high quantities of genes have low Ty3 LTR counts. The genes in ERBxHO40 are predominantly near the ends of each chromosome, while the central portion of each chromosome appears populated by Ty3 LTR elements. (C) Long chromosomes have more genes, however some chromosomes demonstrate exceptional gene content. Chromosome Y is the longest chromosome but includes a number of genes that is comparable to other chromosomes. Each dot represents a chromosome from phased and assembled haplotypes (n = 69; X = 65; Y = 4). Chromosome 1 is of average length but contains more genes than other chromosomes of a similar length. Chromosomes X and 8 also have more genes than chromosomes of similar length. Chromosome 7, where the cannabinoid synthases (CBCAS, CBDAS, THCAS) are found, has the lowest number of genes.

Allele frequencies

afreq <- read.csv("/media/knausb/Vining_lab/knausb/GENOMES_NRQ_2267/ERBxHO40_hapERB/het_wins/allele_freqs.csv")
#hist(afreq$countNA)
#afreq <- afreq[afreq$countNA <= 2, ]
#
afreq <- afreq[afreq$countNA <= 0, ]
afreq[1:3, ]
##     CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1
## 13 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29
## 30 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122
## 54 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133
##    count1.1 countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2
## 13        1       0      509        31 0.9425926 0.05740741 0.1074074  62350
## 30        3       0      412       128 0.7629630 0.23703704 0.4518519  98473
## 54       72       0      263       277 0.4870370 0.51296296 0.4925926 204226
EH23a <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23a$Frequency <- EH23a$ERBfreq
EH23a$facet1 <- "EH23a"

EH23b <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23b$Frequency <- EH23a$HO40freq
EH23b$facet1 <- "EH23b"

Het <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
Het$Frequency <- EH23a$He2
Het$facet1 <- "Het"

afreq <- rbind(EH23a, EH23b, Het)

afreq$facet2 <- afreq$facet1
afreq$facet2 <- sub("EH23[ab]", "EH23", afreq$facet2)
afreq$facet2[ afreq$facet2 == "EH23" ] <- "Allele"
afreq$facet2[ afreq$facet2 == "Het" ] <- "Heterozygosity"
afreq[1:3, ]
##     CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1
## 13 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29
## 30 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122
## 54 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133
##    count1.1 countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2
## 13        1       0      509        31 0.9425926 0.05740741 0.1074074  62350
## 30        3       0      412       128 0.7629630 0.23703704 0.4518519  98473
## 54       72       0      263       277 0.4870370 0.51296296 0.4925926 204226
##    Frequency facet1 facet2
## 13 0.9425926  EH23a Allele
## 30 0.7629630  EH23a Allele
## 54 0.4870370  EH23a Allele
library(ggplot2)

#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.01)
my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)

#as.numeric(as.factor(afreq$CHROM))

#afreq$col2 <- viridisLite::magma( n = 10, alpha = 0.8, begin = 0, end = 1 )[as.numeric(as.factor(afreq$CHROM))]

p <- ggplot( data = afreq, 
             mapping = aes( x = POS2, 
                            #y = ERBfreq,
                            #y = freq,
                            y = Frequency,
                            color = CHROM )
                            #color = col2)
             )
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + geom_hline( yintercept = 0.5, linetype = 1, color = "#000000", linewidth = 1 )
#p + scale_color_manual( values = viridisLite::magma( n = 10, alpha = 0.01, begin = 0.2, end = 0.9 ) )
p <- p + ylim(0.3, 0.7)
p <- p + scale_color_manual( values = my_pal )
#
#p <- 
p + facet_grid(facet1 ~ .)

#p <- 
ahplot <- p + facet_grid(facet2 ~ .)
#p
ahplot

#p + scale_color_manual( values = afreq$col2 )
#+ scale_color_viridis_b()

Figure 1 (Composite plot)

library("ggpubr")

ggarrange(
  plotlist = list(p1, ahplot),
  labels = c("A", "B"),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.